Word embeddings

Imports

Import dependencies



In [1]:

    
%%bash
ls | grep .csv









    



emails.csv
emails.csv.zip



In [2]:

    
# %%bash
# pip3 install bokeh



In [3]:

    
# built-in libs
import email

# processing libs
import pandas as pd

# display libs
from tqdm import tqdm_notebook

Import data



In [4]:

    
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
emails_df = next(emails_full_df)



In [5]:

    
print(emails_df.shape)
emails_df.head()









    



(10000, 2)






    Out[5]:







  
    
      
      file
      message
    
  
  
    
      0
      allen-p/_sent_mail/1.
      Message-ID: <18782981.1075855378110.JavaMail.e...
    
    
      1
      allen-p/_sent_mail/10.
      Message-ID: <15464986.1075855378456.JavaMail.e...
    
    
      2
      allen-p/_sent_mail/100.
      Message-ID: <24216240.1075855687451.JavaMail.e...
    
    
      3
      allen-p/_sent_mail/1000.
      Message-ID: <13505866.1075863688222.JavaMail.e...
    
    
      4
      allen-p/_sent_mail/1001.
      Message-ID: <30922949.1075863688243.JavaMail.e...



In [6]:

    
emails_df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
file       10000 non-null object
message    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB



In [7]:

    
%time
messages_obj_lst = []
messages_str_lst = []

message_metadata = {}

for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
        else:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    
    payload = msg.get_payload() # decode=True
    
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
    #    break

print('messages_obj_lst size: %i' % len(messages_obj_lst))









    



CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.53 µs






    





 
 










    



messages_obj_lst size: 10000



In [8]:

    
# update dataframe object
# emails_df.rename(columns = {'message':'message_obj'}, inplace = True)
emails_df = emails_df.assign(message_obj = pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload     = pd.Series(messages_str_lst).values)

# print(emails_df.payload.str.contains(r'\\'))
emails_df['payload'] = emails_df.payload.str.replace(r'\n', '')



In [9]:

    
emails_df.head()









    Out[9]:







  
    
      
      file
      message
      message_obj
      payload
    
  
  
    
      0
      allen-p/_sent_mail/1.
      Message-ID: <18782981.1075855378110.JavaMail.e...
      [Message-ID, Date, From, To, Subject, Mime-Ver...
      Here is our forecast
    
    
      1
      allen-p/_sent_mail/10.
      Message-ID: <15464986.1075855378456.JavaMail.e...
      [Message-ID, Date, From, To, Subject, Mime-Ver...
      Traveling to have a business meeting takes the...
    
    
      2
      allen-p/_sent_mail/100.
      Message-ID: <24216240.1075855687451.JavaMail.e...
      [Message-ID, Date, From, To, Subject, Mime-Ver...
      test successful.  way to go!!!
    
    
      3
      allen-p/_sent_mail/1000.
      Message-ID: <13505866.1075863688222.JavaMail.e...
      [Message-ID, Date, From, To, Subject, Mime-Ver...
      Randy, Can you send me a schedule of the salar...
    
    
      4
      allen-p/_sent_mail/1001.
      Message-ID: <30922949.1075863688243.JavaMail.e...
      [Message-ID, Date, From, To, Subject, Mime-Ver...
      Let's shoot for Tuesday at 11:45.



In [10]:

    
# del messages_obj_lst
# del messages_str_lst

emails_df.drop('message', axis=1, inplace=True)



In [ ]:



In [11]:

    
corpus_text = '\n'.join(emails_df[:50000]['payload'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]



In [12]:

    
def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]



In [ ]:

Modelling



In [13]:

    
from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)



In [14]:

    
vectors = model.wv
# del model



In [15]:

    
vectors['good']









    Out[15]:





array([-0.270508  , -0.17306764,  1.6283128 ,  0.0789329 ,  0.31106964,
        0.769532  ,  1.2730443 , -0.8092405 ,  0.7060038 ,  0.86828023,
       -2.6277056 , -1.3929644 ,  0.6448156 , -0.7771182 , -1.6537852 ,
       -0.4743401 , -1.1166382 ,  1.1569368 , -1.1398625 ,  0.80520093,
       -1.7167239 , -1.5579057 ,  0.10402635,  3.0918787 , -0.0558991 ,
       -0.43233722, -2.051206  , -0.66570055,  1.5504636 , -0.2648149 ,
        0.11560618, -0.32946193, -0.372461  , -0.781641  ,  1.0626622 ,
       -0.5553393 ,  0.5192849 ,  2.4005246 ,  0.05692073, -2.3076432 ,
       -1.5484774 , -0.67129016,  1.7084714 ,  0.68807465, -0.2931756 ,
        0.6166011 ,  1.0874461 , -0.32894936, -2.7945118 ,  0.0930008 ,
       -0.46457678,  1.2848035 , -1.4603778 ,  0.22172059, -0.99450624,
       -1.0969896 , -2.3467455 ,  0.4534696 ,  0.4488058 , -0.7499471 ,
        1.3231988 ,  1.7316022 ,  0.3932503 ,  0.06664114,  0.47186232,
        2.9767272 , -0.49195403,  2.0907822 ,  1.1899747 ,  1.2920406 ,
        0.4943122 , -1.3712525 ,  0.35063776, -1.9195726 ,  1.0069174 ,
       -2.0902    , -0.33823916, -1.9204639 , -0.7886482 ,  2.2157645 ,
       -0.8052555 ,  0.6665139 , -1.1551962 , -0.8287558 ,  0.05057469,
        2.554974  , -0.62844616,  1.7241517 , -0.09033989,  0.4090363 ,
        0.8304872 , -0.9974313 ,  0.94597244, -0.65530026,  0.4551282 ,
        0.7032987 , -0.13521333, -1.385317  , -0.15468638,  0.7746631 ],
      dtype=float32)



In [16]:

    
print(vectors.similarity('you', 'your'))
print(vectors.similarity('you', 'internet'))









    



0.4465897
0.33204436






    



/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [17]:

    
vectors.most_similar('kill')









    



/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):






    Out[17]:





[('correspondence', 0.7597194910049438),
 ('havestructured', 0.7563828825950623),
 ('throw', 0.7550570964813232),
 ('=with', 0.7486575841903687),
 ('refight', 0.7483857870101929),
 ('thinkof', 0.7468332648277283),
 ('do.>', 0.7420369386672974),
 ('visitors', 0.741276204586029),
 ('nik', 0.7395448684692383),
 ('pleaded', 0.7374611496925354)]



In [18]:

    
len(model.wv.vocab)









    Out[18]:





52543



In [19]:

    
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# print(ordered_terms)
# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(model.wv.syn0norm[term_indices, :], index=ordered_terms)

word_vectors.head(3)









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).
  del sys.path[0]






    Out[19]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
    
  
  
    
      
      0.103211
      -0.001504
      0.051223
      0.082046
      -0.015313
      -0.031139
      0.239088
      -0.023072
      0.260240
      0.007476
      ...
      -0.152058
      -0.074253
      -0.112685
      -0.128899
      -0.013398
      -0.003880
      0.017925
      -0.033080
      0.026232
      0.085833
    
    
      the
      0.113068
      0.025233
      0.103621
      0.153182
      0.158703
      -0.112839
      0.170684
      -0.030864
      -0.080816
      0.092015
      ...
      0.139859
      -0.013526
      -0.067141
      -0.132761
      0.028204
      0.078747
      0.030681
      -0.142466
      0.065911
      -0.094258
    
    
      to
      -0.170276
      0.023519
      -0.007925
      0.020925
      0.152509
      -0.023285
      0.163287
      -0.003122
      -0.051574
      -0.069945
      ...
      0.040197
      0.071331
      0.091307
      -0.199199
      -0.065438
      0.018316
      -0.014435
      -0.107577
      0.190768
      -0.028655
    
  

3 rows × 100 columns



In [20]:

    
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in model.most_similar(positive=[token], topn=topn):
        print (word, round(similarity, 3))



In [21]:

    
get_related_terms(u'illegal')









    



societe 0.8
criminal 0.795
prohibited.neither 0.785
disclosureby> 0.784
representations 0.779
intensify 0.771
>others 0.769
asinformation 0.767
unauthorized 0.767
byothers 0.764






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [22]:

    
get_related_terms(u'killed')









    



nero 0.861
howthis 0.824
al,i 0.824
beleivehowthis 0.815
rivera 0.813
after-what 0.807
disasterthanksgiving 0.797
mullick 0.795
asti 0.793
$175.00 0.792






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [23]:

    
get_related_terms(u'contract')









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):






    



bridge 0.751
agreement 0.743
transaction 0.737
partnerwill 0.725
unit 0.717
dead 0.712
maximum 0.709
package 0.707
fee 0.705
bond 0.701



In [24]:

    
get_related_terms(u'fired')









    



nat 0.731
natural 0.697
swap 0.692
disaster 0.669
strip 0.661
vpenanat 0.654
curve.3 0.647
emit 0.643
year> 0.639
projs 0.636






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [25]:

    
def word_algebra(add=[], subtract=[], topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = model.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print(term)



In [26]:

    
word_algebra(add=[u'i', u'will'])









    



plans






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [27]:

    
word_algebra(add=[u'you', u'will'])









    



them






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [28]:

    
word_algebra(add=[u'i', u'am'])









    



i'm






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [29]:

    
word_algebra(add=[u'mother', u'fuck'])









    



<jmcvey@exhibitworks.com>






    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):



In [ ]:



In [30]:

    
from sklearn.manifold import TSNE



In [31]:

    
tsne_input = word_vectors
tsne_input = tsne_input.head(5000)



In [32]:

    
tsne_input[:2]









    Out[32]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
    
  
  
    
      
      0.103211
      -0.001504
      0.051223
      0.082046
      -0.015313
      -0.031139
      0.239088
      -0.023072
      0.260240
      0.007476
      ...
      -0.152058
      -0.074253
      -0.112685
      -0.128899
      -0.013398
      -0.003880
      0.017925
      -0.033080
      0.026232
      0.085833
    
    
      the
      0.113068
      0.025233
      0.103621
      0.153182
      0.158703
      -0.112839
      0.170684
      -0.030864
      -0.080816
      0.092015
      ...
      0.139859
      -0.013526
      -0.067141
      -0.132761
      0.028204
      0.078747
      0.030681
      -0.142466
      0.065911
      -0.094258
    
  

2 rows × 100 columns



In [33]:

    
%%time
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)









    



CPU times: user 1min 27s, sys: 6.4 s, total: 1min 33s
Wall time: 1min 33s



In [34]:

    
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

tsne_vectors.head()



In [35]:

    
tsne_vectors[u'word'] = tsne_vectors.index



In [36]:

    
tsne_vectors.head()



In [37]:

    
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()









    





    
        
        Loading BokehJS ...



In [38]:

    
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);



In [ ]:



In [ ]:

Bibliography



In [ ]:

	x_coord	y_coord
	-43.682186	33.196354
the	60.936298	-31.871368
to	55.425861	-30.385191
and	33.788177	10.613073
of	35.482513	7.884809

	file	message
0	allen-p/_sent_mail/1.	Message-ID: <18782981.1075855378110.JavaMail.e...
1	allen-p/_sent_mail/10.	Message-ID: <15464986.1075855378456.JavaMail.e...
2	allen-p/_sent_mail/100.	Message-ID: <24216240.1075855687451.JavaMail.e...
3	allen-p/_sent_mail/1000.	Message-ID: <13505866.1075863688222.JavaMail.e...
4	allen-p/_sent_mail/1001.	Message-ID: <30922949.1075863688243.JavaMail.e...

	file	message	message_obj	payload
0	allen-p/_sent_mail/1.	Message-ID: <18782981.1075855378110.JavaMail.e...	[Message-ID, Date, From, To, Subject, Mime-Ver...	Here is our forecast
1	allen-p/_sent_mail/10.	Message-ID: <15464986.1075855378456.JavaMail.e...	[Message-ID, Date, From, To, Subject, Mime-Ver...	Traveling to have a business meeting takes the...
2	allen-p/_sent_mail/100.	Message-ID: <24216240.1075855687451.JavaMail.e...	[Message-ID, Date, From, To, Subject, Mime-Ver...	test successful. way to go!!!
3	allen-p/_sent_mail/1000.	Message-ID: <13505866.1075863688222.JavaMail.e...	[Message-ID, Date, From, To, Subject, Mime-Ver...	Randy, Can you send me a schedule of the salar...
4	allen-p/_sent_mail/1001.	Message-ID: <30922949.1075863688243.JavaMail.e...	[Message-ID, Date, From, To, Subject, Mime-Ver...	Let's shoot for Tuesday at 11:45.

	0	1	2	3	4	5	6	7	8	9	...	90	91	92	93	94	95	96	97	98	99
	0.103211	-0.001504	0.051223	0.082046	-0.015313	-0.031139	0.239088	-0.023072	0.260240	0.007476	...	-0.152058	-0.074253	-0.112685	-0.128899	-0.013398	-0.003880	0.017925	-0.033080	0.026232	0.085833
the	0.113068	0.025233	0.103621	0.153182	0.158703	-0.112839	0.170684	-0.030864	-0.080816	0.092015	...	0.139859	-0.013526	-0.067141	-0.132761	0.028204	0.078747	0.030681	-0.142466	0.065911	-0.094258
to	-0.170276	0.023519	-0.007925	0.020925	0.152509	-0.023285	0.163287	-0.003122	-0.051574	-0.069945	...	0.040197	0.071331	0.091307	-0.199199	-0.065438	0.018316	-0.014435	-0.107577	0.190768	-0.028655